jupyter: python3¶

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import os
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.patches as mpatches
import umap
import numpy as np

from CryptoFraudDetection.utils import embedding
from CryptoFraudDetection.utils import enums
from CryptoFraudDetection.utils import logger
In [2]:
# Project-wide logger at INFO level, writing to ../logs.
LOGGER = logger.Logger(name=__name__, level=enums.LoggerMode.INFO, log_dir="../logs")
In [3]:
# Load the scraped X (Twitter) posts and preview the first rows.
df = pd.read_parquet("../data/processed/x_posts.parquet")
df.head(5)
Out[3]:
username tweet timestamp likes impressions comments reposts bookmarks searchkeyword
0 @officialmcafee Bitcoin now at $16,600.00. Those of you in the... 2017-12-08T01:09:27.000Z 6200.0 0.0 1259.0 4518.0 486.0 Bitcoin
1 @CharlieShrem Time to buy #bitcoin 2018-02-01T16:37:51.000Z 17000.0 0.0 469.0 6858.0 16.0 Bitcoin
2 @rogerkver I just bought 50 Bitcoin (BCH) ATM machines fo... 2018-02-13T00:53:30.000Z 2000.0 0.0 778.0 701.0 2.0 Bitcoin
3 @rogerkver Who wants a Bitcoin Cash Visa debit card? htt... 2017-12-11T16:12:51.000Z 4300.0 0.0 765.0 1929.0 2.0 Bitcoin
4 @ARealHyena I liked a \n@YouTube\n video http://youtu.be/i... 2018-02-27T23:59:37.000Z 0.0 0.0 0.0 0.0 0.0 Bitcoin
In [4]:
# Number of scraped tweets per search keyword (i.e. per coin).
df["searchkeyword"].value_counts()
Out[4]:
searchkeyword
Bitcoin       4405
Ethereum      3075
Chainlink     1912
thorchain     1640
$Atom         1558
Bitforex      1543
Terra Luna    1383
$Avax         1341
$FTT          1181
Safemoon       854
$STA           372
Beercoin       270
Teddy Doge      47
Name: count, dtype: int64
In [5]:
# Count of NaN values in every column.

df.isna().sum()
Out[5]:
username         0
tweet            0
timestamp        0
likes            0
impressions      0
comments         0
reposts          0
bookmarks        0
searchkeyword    0
dtype: int64
In [6]:
# Count of empty strings in every column (NaN check above misses these).

(df == "").sum()
Out[6]:
username           4
tweet            272
timestamp          0
likes              0
impressions        0
comments           0
reposts            0
bookmarks          0
searchkeyword      0
dtype: int64
In [7]:
# Print every row with an empty string in the username column.

df[df["username"] == ""]
Out[7]:
username tweet timestamp likes impressions comments reposts bookmarks searchkeyword
3369 . \n@beeple\n fully unicode spec compliant, th... 2021-08-26T03:30:24.000Z 8.0 0.0 0.0 2.0 0.0 Ethereum
7521 you're not an investor, you're a tokenholder 2023-10-27T20:58:19.000Z 0.0 20.0 0.0 0.0 0.0 Chainlink
16925 2023-07-14T02:29:05.000Z 0.0 15.0 0.0 0.0 0.0 Chainlink
16992 staking v0.2 has semi-slashing commence bull t... 2023-09-04T05:24:02.000Z 1.0 234.0 2.0 0.0 0.0 Chainlink
In [8]:
# Show all rows whose tweet text is empty.

df[df["tweet"] == ""]
Out[8]:
username tweet timestamp likes impressions comments reposts bookmarks searchkeyword
389 @2357_is_prime 2019-09-16T17:50:04.000Z 0.0 0.0 0.0 0.0 0.0 Bitcoin
472 @cryptorick_ 2019-05-05T09:53:01.000Z 1.0 0.0 0.0 0.0 0.0 Bitcoin
486 @PayneFullHuman 2019-05-19T13:39:31.000Z 1.0 0.0 0.0 0.0 0.0 Bitcoin
617 @BitcoinWanda 2020-11-07T10:43:52.000Z 2.0 0.0 0.0 0.0 0.0 Bitcoin
776 @Bitcoin21oooooo 2022-10-25T14:05:39.000Z 1.0 0.0 0.0 0.0 0.0 Bitcoin
... ... ... ... ... ... ... ... ... ...
17913 @LiamSolo42 2024-08-28T18:39:59.000Z 0.0 49.0 0.0 0.0 0.0 thorchain
17928 @f1uffypaws 2022-10-22T21:32:53.000Z 0.0 38.0 0.0 0.0 0.0 thorchain
17959 @OliverLaFarge 2024-01-05T15:12:26.000Z 1.0 132.0 0.0 0.0 0.0 thorchain
17985 @matty_dot_thor 2024-09-24T02:56:04.000Z 2.0 75.0 0.0 0.0 0.0 thorchain
19064 @bitforexcom 2023-10-06T07:33:57.000Z 0.0 5.0 0.0 0.0 0.0 Bitforex

272 rows × 9 columns

In [9]:
# Report, per search keyword, how many rows have an empty tweet field.
tweet_is_empty = df['tweet'] == ''
empty_string_counts = df[tweet_is_empty].groupby('searchkeyword').size()

for keyword, count in empty_string_counts.items():
    print(f"{keyword}: {count} leere Strings im Tweet-Feld")
Beercoin: 2 leere Strings im Tweet-Feld
Bitcoin: 51 leere Strings im Tweet-Feld
Bitforex: 2 leere Strings im Tweet-Feld
Chainlink: 81 leere Strings im Tweet-Feld
Ethereum: 13 leere Strings im Tweet-Feld
Safemoon: 64 leere Strings im Tweet-Feld
Terra Luna: 9 leere Strings im Tweet-Feld
thorchain: 50 leere Strings im Tweet-Feld
In [10]:
# Remove rows whose tweet text is an empty string.

df = df.loc[df["tweet"].ne("")]
In [11]:
# Bar chart: number of remaining tweets per search keyword, dark theme,
# with the exact count annotated above each bar.
value_counts = df['searchkeyword'].value_counts().reset_index()
value_counts.columns = ['searchkeyword', 'count']

plt.style.use('dark_background')
sns.set_theme(style="dark")

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=value_counts,
    x='searchkeyword',
    y='count',
    hue='searchkeyword',
    dodge=False,
    palette='viridis',
    legend=False,
    ax=ax,
)

ax.set_xlabel('Search Keyword', color='white')
ax.set_ylabel('Anzahl', color='white')
ax.set_title('Count of Search Keywords', color='white')

plt.xticks(rotation=45, ha='right', color='white')
plt.yticks(color='white')

# Force a fully black canvas (axes background plus figure patch).
ax.set_facecolor('black')
fig.patch.set_facecolor('black')

for spine in ax.spines.values():
    spine.set_color('white')

ax.tick_params(colors='white', which='both')
ax.grid(True, color='gray', linestyle='--', linewidth=0.5)

# Annotate each bar, offset by 2% of the tallest bar so labels don't touch.
label_offset = 0.02 * value_counts['count'].max()
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar_height + label_offset,
        s=f'{int(bar_height)}',
        ha='center',
        color='white',
        fontsize=10,
    )

fig.tight_layout()
plt.show()

This plot shows the count of tweets per coin in the scraped X dataset. Some coins have relatively few tweets, but that is because they also have a shorter price time series, which we use as the start and end date for scraping.

In [12]:
# Compute tweet embeddings once and cache them as parquet; on later runs
# the cached file is loaded instead of re-embedding.
output_file = "../data/processed/x_posts_embeddings.parquet"

tqdm.pandas(desc="Embedding Progress")

embedder = embedding.Embedder(LOGGER)

# Maps the raw X search keyword to the coin's canonical display name
# (matches the `name` field in ../data/raw/coins.json).
KEYWORD_TO_COIN = {
    'Bitcoin': 'Bitcoin',
    'Ethereum': 'Ethereum',
    'Chainlink': 'Chainlink',
    'thorchain': 'THORChain',
    '$Atom': 'Cosmos',
    'Bitforex': 'BitForex',
    '$Avax': 'Avalanche',
    'Terra Luna': 'Terra Luna',
    '$FTT': 'FTX Token',
    'Safemoon': 'Safe Moon',
    '$STA': 'STOA Network',
    'Beercoin': 'BeerCoin',
    'Teddy Doge': 'Teddy Doge'
}


def generate_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with an 'embedding' column for each tweet.

    Works on a copy: `df` is a filtered slice of the original frame, so
    assigning a column in place would mutate the caller's data and trigger
    pandas' SettingWithCopy behavior.
    """
    df = df.copy()
    df['embedding'] = embedder.embed(df['tweet'].tolist())
    return df


if not os.path.exists(output_file):
    print("Die Datei existiert nicht. Berechne Embeddings...")
    df = generate_embeddings(df)
    # Replace search keywords with canonical coin names before persisting.
    df['searchkeyword'] = df['searchkeyword'].replace(KEYWORD_TO_COIN)
    df.to_parquet(output_file)
    print(f"Embeddings gespeichert unter: {output_file}")
else:
    print(f"Datei existiert bereits: {output_file}. Lade die Datei...")
    df = pd.read_parquet(output_file)
    print("DataFrame erfolgreich geladen.")
config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\huggingface_hub\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\can-e\.cache\huggingface\hub\models--jinaai--jina-embeddings-v2-small-en. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  warnings.warn(message)
model.safetensors:   0%|          | 0.00/65.4M [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]
vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]
Datei existiert bereits: ../data/processed/x_posts_embeddings.parquet. Lade die Datei...
DataFrame erfolgreich geladen.
In [13]:
# Preview the frame after the embedding column was added/loaded.
df.head(5)
Out[13]:
username tweet timestamp likes impressions comments reposts bookmarks searchkeyword embedding
0 @officialmcafee Bitcoin now at $16,600.00. Those of you in the... 2017-12-08T01:09:27.000Z 6200.0 0.0 1259.0 4518.0 486.0 Bitcoin [-0.89842653, -0.46295443, 0.3335528, 0.286954...
1 @CharlieShrem Time to buy #bitcoin 2018-02-01T16:37:51.000Z 17000.0 0.0 469.0 6858.0 16.0 Bitcoin [-0.6538949, -0.40458962, 0.34390834, 0.860342...
2 @rogerkver I just bought 50 Bitcoin (BCH) ATM machines fo... 2018-02-13T00:53:30.000Z 2000.0 0.0 778.0 701.0 2.0 Bitcoin [-0.6145951, -0.06576121, -0.09537013, 0.43106...
3 @rogerkver Who wants a Bitcoin Cash Visa debit card? htt... 2017-12-11T16:12:51.000Z 4300.0 0.0 765.0 1929.0 2.0 Bitcoin [-0.16293974, -0.23298289, 0.1674211, 0.296802...
4 @ARealHyena I liked a \n@YouTube\n video http://youtu.be/i... 2018-02-27T23:59:37.000Z 0.0 0.0 0.0 0.0 0.0 Bitcoin [-0.43297186, 0.010015366, -0.025700409, 0.749...
In [14]:
# Confirm the expected columns, including the new 'embedding' column.
df.columns
Out[14]:
Index(['username', 'tweet', 'timestamp', 'likes', 'impressions', 'comments',
       'reposts', 'bookmarks', 'searchkeyword', 'embedding'],
      dtype='object')
In [15]:
# These coins are held out as the test split; drop their tweets from
# the analysis set below.
coin_test = ['FTX Token', 'Safe Moon', 'Ethereum', 'Cosmos']

is_test_coin = df['searchkeyword'].isin(coin_test)
df = df[~is_test_coin]
In [16]:
# Join coin metadata (including the fraud label) onto each tweet row.
with open('../data/raw/coins.json', 'r') as f:
    coins_data = json.load(f)

coins_info_df = pd.DataFrame(coins_data)

merged_df = df.merge(
    coins_info_df,
    left_on="searchkeyword",
    right_on="name",
    how="left",
)

# Stack the per-row embedding lists into one (n_tweets, embedding_dim) matrix.
embeddings = np.vstack(merged_df["embedding"].values)
fraud_labels = merged_df["fraud"]
In [17]:
# Project the embeddings to 2-D twice: linearly via PCA and non-linearly
# via UMAP (cosine metric, fixed seed for reproducibility).
embeddings_pca = PCA(n_components=2).fit_transform(embeddings)

embeddings_umap = umap.UMAP(
    n_components=2,
    random_state=42,
    metric='cosine',
).fit_transform(embeddings)
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
In [18]:
# Side-by-side scatter of the PCA and UMAP projections, colored by
# the coin's fraud label.
point_colors = ['red' if is_fraud else 'blue' for is_fraud in fraud_labels]

fig, (ax_pca, ax_umap) = plt.subplots(1, 2, figsize=(14, 6))

ax_pca.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], c=point_colors, alpha=0.25, edgecolor='k')
ax_pca.set_title("PCA of Embeddings")
ax_pca.set_xlabel("PCA Component 1")
ax_pca.set_ylabel("PCA Component 2")

ax_umap.scatter(embeddings_umap[:, 0], embeddings_umap[:, 1], c=point_colors, alpha=0.15, edgecolor='k')
ax_umap.set_title("UMAP of Embeddings")
ax_umap.set_xlabel("UMAP Dimension 1")
ax_umap.set_ylabel("UMAP Dimension 2")

# Manual legend via proxy markers for the two fraud classes.
fraud_legend = [
    plt.Line2D([0], [0], marker='o', color='w', label='Fraud', markerfacecolor='red', markersize=10),
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Fraud', markerfacecolor='blue', markersize=10),
]
fig.legend(handles=fraud_legend, loc="upper right")

fig.tight_layout()
plt.show()

Here we can see the embeddings plotted using their top two PCA and UMAP components. The color indicates whether the tweet was about a scam or a non-scam coin. There are visible clusters of scam and non-scam embeddings, but these could simply be tweets about the same coin.

In [19]:
# Same projections, but colored per coin — checks whether the apparent
# fraud clusters are really just per-coin clusters.
coin_labels = merged_df["searchkeyword"].values
unique_coins = np.unique(coin_labels)

# One tab10 shade per coin, spread evenly over the colormap.
coin_colors = {}
for i, coin in enumerate(unique_coins):
    coin_colors[coin] = plt.cm.tab10(i / len(unique_coins))
colors = [coin_colors[coin] for coin in coin_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panel_specs = [
    (axes[0], embeddings_pca, "PCA of Embeddings", "PCA Component 1", "PCA Component 2"),
    (axes[1], embeddings_umap, "UMAP of Embeddings", "UMAP Dimension 1", "UMAP Dimension 2"),
]
for ax, coords, title, xlabel, ylabel in panel_specs:
    ax.scatter(coords[:, 0], coords[:, 1], c=colors, alpha=0.1, edgecolor='k')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

# Proxy markers so the legend shows one entry per coin.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w', label=coin, markerfacecolor=coin_colors[coin], markersize=10)
    for coin in unique_coins
]
fig.legend(handles=legend_elements, loc="upper right", title="Coins")

fig.tight_layout()
plt.show()

Here we can see the same plot, but with the color representing the coin. The embeddings are clearly clustered by coin, so it is not directly possible to say that the embeddings are clustered by scam vs. non-scam.

In [20]:
# One PCA + UMAP projection per coin, so per-coin cluster structure is
# visible without the cross-coin color mixing of the previous plots.
unique_keywords = df['searchkeyword'].unique()

for keyword in unique_keywords:
    keyword_df = df[df['searchkeyword'] == keyword]

    # Loop-local names: the original reused `embeddings`/`embeddings_pca`/
    # `embeddings_umap`/`pca`, silently clobbering the module-level variables
    # from the earlier cells (hidden-state hazard on re-runs).
    keyword_embeddings = np.vstack(keyword_df['embedding'].values)

    keyword_pca = PCA(n_components=2).fit_transform(keyword_embeddings)
    keyword_umap = umap.UMAP(
        n_components=2, metric='cosine', random_state=42
    ).fit_transform(keyword_embeddings)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    axes[0].scatter(keyword_pca[:, 0], keyword_pca[:, 1], alpha=0.7, edgecolor='k')
    axes[0].set_title(f"PCA of Embeddings for {keyword}")
    axes[0].set_xlabel("PCA Component 1")
    axes[0].set_ylabel("PCA Component 2")

    axes[1].scatter(keyword_umap[:, 0], keyword_umap[:, 1], alpha=0.7, edgecolor='k')
    axes[1].set_title(f"UMAP of Embeddings for {keyword}")
    axes[1].set_xlabel("UMAP Dimension 1")
    axes[1].set_ylabel("UMAP Dimension 2")

    plt.tight_layout()
    plt.show()
    # Free the figure's memory; the inline backend has already rendered it.
    plt.close(fig)
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(

Here we can see the same projections, but for every coin by itself. Scam coins tend to have several dense clusters of embeddings, while non-scam coins tend to have one big cluster with many outliers. However, this is only a tendency: some scam coins have only one cluster, and some non-scam coins have many clusters.

In [21]:
def calculate_metrics(embeddings):
    """Compute three spread/density metrics for one group of embeddings.

    Returns a tuple of:
      - average pairwise cosine distance,
      - average distance to the 12 nearest neighbours (capped at group size),
      - mean per-dimension standard deviation.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        # pdist needs at least two points (a single row would give a NaN
        # mean); report zeros like the empty-group case below.
        return 0.0, 0.0, 0.0

    pairwise_distances = pdist(embeddings, metric='cosine')
    avg_pairwise_distance = pairwise_distances.mean()

    # Cap k at the group size: NearestNeighbors raises when
    # n_neighbors > n_samples (the original hard-coded 12 and would crash
    # on any group with fewer than 12 tweets).
    # NOTE(review): kneighbors includes each point itself (distance 0),
    # which slightly deflates this metric — confirm that is intended.
    knn = NearestNeighbors(n_neighbors=min(12, n_samples))
    knn.fit(embeddings)
    distances, _ = knn.kneighbors(embeddings)
    avg_local_density = distances.mean()

    multivariate_std = np.std(embeddings, axis=0).mean()

    return avg_pairwise_distance, avg_local_density, multivariate_std

with open('../data/raw/coins.json', 'r') as f:
    coins_data = json.load(f)

coins_info_df = pd.DataFrame(coins_data)

# Re-merge so the fraud label is guaranteed to be present on every row.
merged_df = df.merge(coins_info_df[['name', 'fraud']], left_on="searchkeyword", right_on="name", how="left")

if 'fraud' not in merged_df.columns:
    print("Die 'fraud'-Spalte konnte nicht hinzugefügt werden. Bitte überprüfen Sie die Zuordnung.")

unique_keywords = merged_df['searchkeyword'].unique()
metrics_data = []

for keyword in unique_keywords:
    keyword_df = merged_df[merged_df['searchkeyword'] == keyword]

    # A coin is either fraud or not, so one of these two groups is empty
    # for every keyword and falls back to the (0, 0, 0) placeholder.
    fraud_values = keyword_df[keyword_df['fraud'] == True]['embedding'].values
    non_fraud_values = keyword_df[keyword_df['fraud'] == False]['embedding'].values

    if len(fraud_values) > 0:
        fraud_metrics = calculate_metrics(np.vstack(fraud_values))
    else:
        fraud_metrics = (0, 0, 0)

    if len(non_fraud_values) > 0:
        non_fraud_metrics = calculate_metrics(np.vstack(non_fraud_values))
    else:
        non_fraud_metrics = (0, 0, 0)

    metrics_data.append({
        'keyword': keyword,
        'fraud': fraud_metrics,
        'non_fraud': non_fraud_metrics
    })

# Grouped bar charts: one panel per metric, fraud vs. non-fraud per coin.
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

metric_names = ['Average Pairwise Distance', 'Average Local Density', 'Multivariate Standard Deviation']

for i, metric_name in enumerate(metric_names):
    fraud_values = [entry['fraud'][i] for entry in metrics_data]
    non_fraud_values = [entry['non_fraud'][i] for entry in metrics_data]
    keywords = [entry['keyword'] for entry in metrics_data]

    bar_width = 0.4
    x = np.arange(len(keywords))

    axes[i].bar(x - bar_width / 2, fraud_values, width=bar_width, label='Fraud', color='red', alpha=0.7)
    axes[i].bar(x + bar_width / 2, non_fraud_values, width=bar_width, label='Non-Fraud', color='blue', alpha=0.7)

    axes[i].set_title(metric_name)
    axes[i].set_xticks(x)
    axes[i].set_xticklabels(keywords, rotation=45, ha='right')
    axes[i].set_ylabel('Value')
    axes[i].legend()

plt.tight_layout()
plt.show()

In these plots we can see several metrics of the embeddings, such as the average pairwise distance and the local density. Scam coins appear slightly denser, but the difference is small and does not hold as a general rule.

In [22]:
def max_similar_embeddings_normalized(embeddings, similarity_threshold=0.95):
    """Near-duplicate score for one coin's tweets.

    For each tweet, count how many OTHER tweets have cosine similarity above
    `similarity_threshold`; return the largest such count divided by the
    total number of embeddings.
    """
    similarity_matrix = cosine_similarity(embeddings)
    # A tweet must not count as similar to itself.
    np.fill_diagonal(similarity_matrix, 0)
    similar_counts = (similarity_matrix > similarity_threshold).sum(axis=1)
    return similar_counts.max() / len(embeddings)

unique_keywords = merged_df['searchkeyword'].unique()
similarity_data = []

for keyword in unique_keywords:
    keyword_df = merged_df[merged_df['searchkeyword'] == keyword]
    embeddings = np.vstack(keyword_df['embedding'].values)

    similarity_data.append({
        'keyword': keyword,
        'max_similar_embeddings_normalized': max_similar_embeddings_normalized(
            embeddings, similarity_threshold=0.9
        ),
    })

# Sort coins by their near-duplicate score, highest first.
sorted_data = sorted(
    similarity_data,
    key=lambda entry: entry['max_similar_embeddings_normalized'],
    reverse=True,
)

sorted_keywords = [entry['keyword'] for entry in sorted_data]
sorted_max_similar_values_normalized = [
    entry['max_similar_embeddings_normalized'] for entry in sorted_data
]

# Look up each coin's fraud flag to color the bars.
sorted_is_scam = []
for keyword in sorted_keywords:
    fraud_flag = coins_info_df[coins_info_df['name'] == keyword]['fraud'].iloc[0]
    sorted_is_scam.append(fraud_flag)

sorted_colors = ['red' if scam else 'blue' for scam in sorted_is_scam]

plt.figure(figsize=(12, 6))
plt.bar(
    sorted_keywords,
    sorted_max_similar_values_normalized,
    color=sorted_colors,
    alpha=0.7
)
plt.title("Normierte maximale Anzahl ähnlicher Embeddings pro Coin (nach Wert sortiert)")
plt.xlabel("Coin")
plt.ylabel("Normierte maximale Anzahl ähnlicher Embeddings")
plt.xticks(rotation=45, ha='right')

red_patch = mpatches.Patch(color='red', label='Scam')
blue_patch = mpatches.Patch(color='blue', label='Non-Scam')
plt.legend(handles=[red_patch, blue_patch])

plt.tight_layout()
plt.show()

Here we can see, per coin, the maximum number of embeddings that are close to each other (cosine similarity above 0.9, the threshold passed in the cell above), normalized by the total number of embeddings. It is clearly visible that scam coins tend to have more similar embeddings than non-scam coins, although this is not a strict rule (see Avalanche).